This document demonstrates the use of the bRF and LASSO-D3S functions for integrative GRN inference.
Those functions infer the regulatory pathways of Arabidopsis thaliana’s roots in response to nitrate (N) induction from Varala et al., 2018.
They use as inputs the expression profiles of N-responsive genes and TFBS information. Prior TFBS information was built by searching in the promoters of the N-responsive genes the PWM of the N-responsive regulators.
Import of the expression data and the N-responsive genes and regulators :
load('rdata/inference_input_N_response_varala.rdata')
genes <- input_data$grouped_genes; length(genes)
## [1] 1426
tfs <- input_data$grouped_regressors; length(tfs)
## [1] 201
counts <- input_data$counts; dim(counts)
## [1] 1426 45
load("rdata/pwm_occurrences_N_response_varala.rdata")
dim(pwm_occurrence)
## [1] 1426 201
ALPHAS <- seq(0,1, by = 0.1)
nCores = 45
mats <- list()
nrep <- 100
for(alpha in ALPHAS){ # exploring PWM integration strength
for(rep in 1:nrep){ # exploring inherent variability
mat_rf <- bRF_inference(counts, genes, tfs, nTrees = 2000,
alpha = alpha,
pwm_occurrence = pwm_occurrence,
nCores = nCores,
importance = "%IncMSE")
mat_rf_perm <- bRF_inference(counts, genes, tfs, nTrees = 2000,
alpha = alpha, tf_expression_permutation = TRUE,
pwm_occurrence = pwm_occurrence,
nCores = nCores,
importance = "%IncMSE")
mats[[paste0("bRF_", as.character(alpha), '_trueData_', rep)]] <- mat_rf
mats[[paste0("bRF_", as.character(alpha), '_shuffled_', rep)]] <- mat_rf_perm
}
}
save(mats, file = "results/100_permutations_bRF_importances.rdata")
load( "results/100_permutations_bRF_importances.rdata")
edges <- list()
densities <- c(0.005, 0.01,0.05,0.075)
for(name in names(mats)){ # exploring importance threshold stringency
for(density in densities){
edges[[paste0(name, '_', density)]] <-
bRF_network(mats[[name]], density = density, pwm_occurrence, genes, tfs)
}
}
save(edges, file = "results/100_permutations_bRF_edges.rdata")
names(edges)
# compure precision and recall
load( "results/100_permutations_bRF_importances.rdata")
load("results/clusters_mse_bRF_100permutations.rdata")
positive_genes <- names(clusters_rf[clusters_rf==2])
edges <- list()
densities <- c(0.005, 0.01,0.05,0.075)
for(name in names(mats)){ # exploring importance threshold stringency
for(density in densities){
edges[[paste0(name, '_', density)]] <-
bRF_network(mats[[name]][,positive_genes], density = density, pwm_occurrence, positive_genes, tfs)
}
}
save(edges, file = "results/100_permutations_bRF_edges_positive_same_density.rdata")
# compure precision and recall
color_palette = c("#C12131", "#EC5D2F", "#FE945C", "#FFC08E" )
prettyZero <- function(l){
max.decimals = max(nchar(str_extract(l, "\\.[0-9]+")), na.rm = T)-1
lnew = formatC(l, replace.zero = T, zero.print = "0",
digits = max.decimals, format = "f", preserve.width=T)
return(lnew)
}
settings <- c("method", "alpha", "dataset","rep", "density")
load("results/100_permutations_bRF_edges.rdata")
# number of edges per network
nrows <-
data.frame(alpha_rep = names(unlist(lapply(edges, FUN = nrow))),
n_edges = unlist(lapply(edges, FUN = nrow)))
nrows[, settings] <-
str_split_fixed(nrows$alpha_rep, '_', length(settings))
edges_num <- lapply(edges, function(df)
df[sapply(df, is.numeric)])
pwm_support <-
data.frame(alpha_rep = names(unlist(lapply(edges_num, FUN = nrow))),
pwm = unlist(lapply(edges_num, FUN = colMeans)))
pwm_support[, settings] <-
str_split_fixed(pwm_support$alpha_rep, '_', length(settings))
pwm_support %>%
left_join(nrows, by = settings) %>%
mutate(alpha = as.numeric(alpha),
density_label = paste(density, ':', n_edges, 'edges')) %>%
ggplot(aes(color = density_label, x = alpha, y = pwm)) +
ggh4x::facet_nested_wrap(vars(method, density), ncol = 8, nest_line = T) +
geom_point() +
geom_smooth(aes(fill = density_label, linetype =dataset) , alpha = 0.1) +
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
strip.text = element_text(size = 16),
legend.position = "top",
legend.text = element_text(size = 15),
axis.text = element_text(size = 12)
) +
xlab(expression(alpha)) + ylab("Mean PWM score in selected edges") +
ggtitle("Average PWM support of inferred edges") +
guides(color = guide_legend(nrow = 2, byrow = TRUE),
fill = guide_legend(nrow = 2, byrow = TRUE)) +
ylab(expression(paste("mean(", pi[tr], ")"))) +
scale_x_continuous(labels = prettyZero) +
scale_color_manual(name = "Network density", values = color_palette) +
scale_fill_manual(name = "Network density", values = color_palette)
## Computing validation metrics
val_conn <-
evaluate_networks(
edges,
validation = c("TARGET", "CHIPSeq", "DAPSeq"),
nCores = 15,
input_genes = genes,
input_tfs = tfs
)
val_conn[, settings] <-
str_split_fixed(val_conn$network_name, '_', length(settings))
save(val_conn, file = "results/100_permutations_bRF_validation.rdata")
val_chip <-
evaluate_networks(
edges,
validation = c("CHIPSeq"),
nCores = 15,
input_genes = genes,
input_tfs = tfs
)
val_chip[, settings] <-
str_split_fixed(val_chip$network_name, '_', length(settings))
save(val_chip, file = "results/100_permutations_bRF_validation_chip.rdata")
val_target <-
evaluate_networks(
edges,
validation = c("TARGET"),
nCores = 15,
input_genes = genes,
input_tfs = tfs
)
val_target[, settings] <-
str_split_fixed(val_target$network_name, '_', length(settings))
save(val_target, file = "results/100_permutations_bRF_validation_target.rdata")
val_dap <-
evaluate_networks(
edges,
validation = c("DAPSeq"),
nCores = 15,
input_genes = genes,
input_tfs = tfs
)
val_dap[, settings] <-
str_split_fixed(val_dap$network_name, '_', length(settings))
save(val_dap, file = "results/100_permutations_bRF_validation_dap.rdata")
edges <- edges[!str_detect(names(edges), '_0.075|0.05')]
draw_precision_recall <- function(validation, data_type){
data_val <- validation %>%
group_by(alpha, dataset, density) %>%
mutate(mean_precision = mean(precision, na.rm = T),
sd_precision = sd(precision, na.rm = T),
mean_recall = mean(recall, na.rm = T),
sd_recall = sd(recall, na.rm = T)) %>%
dplyr::select(-network_name) %>%
left_join(nrows, by = settings) %>%
mutate(alpha = as.numeric(alpha)) %>%
mutate(density_label = paste(density, ':', n_edges, 'edges'))
precision <- data_val %>%
ggplot(aes(
x = as.numeric(alpha),
y = precision,
color = dataset,
fill = dataset
))+
ggh4x::facet_nested_wrap(vars(method, density), ncol = 8, nest_line = T) +
geom_ribbon(aes(ymin = mean_precision - sd_precision ,
ymax = mean_precision + sd_precision ),
alpha = .4) +theme_pubr(legend = "top")+
geom_point(alpha = 0.1) + geom_smooth(se=F)+xlab("alpha") +
xlab(expression(alpha)) + ylab("Precision") +
ggtitle(paste("Precision against", data_type)) +
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
strip.text = element_text(size = 16),
legend.text = element_text(size = 15),
axis.text = element_text(size = 12),
legend.position = 'top'
)
recall <- data_val%>%
ggplot(aes(
x = as.numeric(alpha),
y = recall,
color = dataset,
fill = dataset
))+
ggh4x::facet_nested_wrap(vars(method, density), ncol = 8,
nest_line = T, scales = "free") +
geom_ribbon(aes(ymin = mean_recall - sd_recall ,
ymax = mean_recall + sd_recall ),
alpha = .4) +theme_pubr(legend = "none")+
geom_point(alpha = 0.1) + geom_smooth(se=F)+xlab("alpha") +
xlab(expression(alpha)) + ylab("Recall") +
ggtitle(paste("Recall against", data_type)) +
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
strip.text = element_text(size = 16),
legend.text = element_text(size = 15),
axis.text = element_text(size = 12),
legend.position = 'none'
)
return(precision/recall)
}
load("results/100_permutations_bRF_validation.rdata")
draw_precision_recall(val_conn, "ConnecTF")
load(file = "results/100_permutations_bRF_validation_cluster_pos.rdata")
draw_precision_recall(val_conn_pos, "ConnecTF, positive genes")
load("results/100_permutations_bRF_validation_chip.rdata")
draw_precision_recall(val_chip, "CHIP-Seq")
load(file = "results/100_permutations_bRF_validation_chip_cluster_pos.rdata")
draw_precision_recall(val_chip, "CHIP-Seq, positive genes")
load("results/100_permutations_bRF_validation_target.rdata")
draw_precision_recall(val_target, "TARGET")
load(file = "results/100_permutations_bRF_validation_target_cluster_pos.rdata")
draw_precision_recall(val_target, "TARGET, positive genes")
load("results/100_permutations_bRF_validation_dap.rdata")
draw_precision_recall(val_dap, "DAP-Seq")
load(file = "results/100_permutations_bRF_validation_dap_cluster_pos.rdata")
draw_precision_recall(val_dap, "DAP-Seq, positive genes")
The same densities are obtained (number of edges), but after filtering edges that only point toward a group from the positive cluster of genes obtained from MSE permutations.
load("results/100_permutations_bRF_edges_positive_same_density.rdata")
# number of edges per network
nrows <-
data.frame(alpha_rep = names(unlist(lapply(edges, FUN = nrow))),
n_edges = unlist(lapply(edges, FUN = nrow)))
nrows[, settings] <-
str_split_fixed(nrows$alpha_rep, '_', length(settings))
edges_num <- lapply(edges, function(df)
df[sapply(df, is.numeric)])
pwm_support <-
data.frame(alpha_rep = names(unlist(lapply(edges_num, FUN = nrow))),
pwm = unlist(lapply(edges_num, FUN = colMeans)))
pwm_support[, settings] <-
str_split_fixed(pwm_support$alpha_rep, '_', length(settings))
pwm_support %>%
left_join(nrows, by = settings) %>%
mutate(alpha = as.numeric(alpha),
density_label = paste(density, ':', n_edges, 'edges')) %>%
ggplot(aes(color = density_label, x = alpha, y = pwm)) +
ggh4x::facet_nested_wrap(vars(method, density), ncol = 8, nest_line = T) +
geom_point() +
geom_smooth(aes(fill = density_label, linetype =dataset) , alpha = 0.1) +
theme(
strip.background = element_blank(),
axis.title.x = element_text(size = 22),
title = element_text(size = 16),
strip.text = element_text(size = 16),
legend.position = "top",
legend.text = element_text(size = 15),
axis.text = element_text(size = 12)
) +
xlab(expression(alpha)) + ylab("Mean PWM score in selected edges") +
ggtitle("Average PWM support of inferred edges") +
guides(color = guide_legend(nrow = 2, byrow = TRUE),
fill = guide_legend(nrow = 2, byrow = TRUE)) +
ylab(expression(paste("mean(", pi[tr], ")"))) +
scale_x_continuous(labels = prettyZero)
Not all densities should be explored with this restriction of genes, as there are too few genes to grant sufficient interactions supported by a prior of 1. We keep only the two smallest densities.
#frees up RAM
# edges <- edges[!str_detect(names(edges), '_0.075|0.05')]
val_conn_pos <-
evaluate_networks(
edges,
validation = c("TARGET", "CHIPSeq", "DAPSeq"),
nCores = 10,
input_genes = genes,
input_tfs = tfs
)
val_conn_pos[, settings] <-
str_split_fixed(val_conn_pos$network_name, '_', length(settings))
save(val_conn_pos, file = "results/100_permutations_bRF_validation_cluster_pos.rdata")
val_chip <-
evaluate_networks(
edges,
validation = c("CHIPSeq"),
nCores = 10,
input_genes = genes,
input_tfs = tfs
)
val_chip[, settings] <-
str_split_fixed(val_chip$network_name, '_', length(settings))
save(val_chip, file = "results/100_permutations_bRF_validation_chip_cluster_pos.rdata")
val_target <-
evaluate_networks(
edges,
validation = c("TARGET"),
nCores = 10,
input_genes = genes,
input_tfs = tfs
)
val_target[, settings] <-
str_split_fixed(val_target$network_name, '_', length(settings))
save(val_target, file = "results/100_permutations_bRF_validation_target_cluster_pos.rdata")
val_dap <-
evaluate_networks(
edges,
validation = c("DAPSeq"),
nCores = 10,
input_genes = genes,
input_tfs = tfs
)
val_dap[, settings] <-
str_split_fixed(val_dap$network_name, '_', length(settings))
save(val_dap, file = "results/100_permutations_bRF_validation_dap_cluster_pos.rdata")